VAST Challenge 3
With reference to Challenge 3 of VAST Challenge 2022, you are required to reveal the economic of the city of Engagement, Ohio USA by using appropriate static and interactive statistical graphics methods
This exercise requires us to apply the skills you had learned in
Lesson 1 and Hands-on Exercise 1 to reveal the demographic of the city
of Engagement, Ohio USA by using appropriate static statistical graphics
methods. The data should be processed by using appropriate tidyverse
family of packages and the statistical graphics must be prepared using
ggplot2 and its extensions.
packages = c('ggiraph', 'plotly', 'tidyverse', 'DT','gganimate',
'knitr', 'ggdist', 'scales', 'grid', 'gridExtra',
'patchwork','ggsignif','gghighlight',"hrbrthemes",
'readxl', 'gifski', 'gapminder','treemap', 'treemapify',
'rPackedBar','ggridges','rmarkdown','crosstalk',
'd3scatter','tidycensus','timetk','ggseas','lubridate',
'ggrepel','GGally')
for(p in packages) {
if(!require(p, character.only = T)) {
install.packages(p)
}
library(p, character.only = T)
}
financial <- read_csv('./data/FinancialJournal.csv')
participant_data <- read_csv('./data/Participants.csv')
participant_data$educationLevel<-factor(participant_data$educationLevel,ordered=TRUE,levels=c('Low','HighSchoolOrCollege',"Bachelors","Graduate"))
brks <- c(17, 20, 30, 40, 50, 60, Inf)
grps <- c('<=20', '21-30','31-40', '41-50', '51-60', '>60')
participant_data$Age_Group <- cut(participant_data$age, breaks=brks, labels = grps, right = FALSE)
glimpse(financial)
Rows: 1,856,330
Columns: 4
$ participantId <dbl> 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5…
$ timestamp <dttm> 2022-03-01, 2022-03-01, 2022-03-01, 2022-03-0…
$ category <chr> "Wage", "Shelter", "Education", "Wage", "Shelt…
$ amount <dbl> 2472.50756, -554.98862, -38.00538, 2046.56221,…
Our data includes two csv files from the VAST data source, namely FinancialJournal.csv and Participants.csv. To show the financial health of Ohio city, we derived three supporting tables from the original data. Generally, we want to ananlyze the spending habits and wage status of people with different education background, age, and household size.
financial$DateTime <- as.POSIXct(financial$timestamp, format="%Y-%m-%d %H:%M:%S")
financial$year <- format(financial$DateTime, format="%Y")
financial$month <- format(financial$DateTime, format="%m")
financial$day <- format(financial$DateTime, format="%d")
financial$hour <- format(financial$DateTime, format="%H")
financial$minute <- format(financial$DateTime, format="%M")
financial$second <- format(financial$DateTime, format="%S")
financial$date <- format(financial$DateTime, format="%Y %b")
monthly_income <- financial %>%
filter(category %in% c('Wage', 'RentAdjustment')) %>%
group_by(month) %>%
summarise(income = sum(amount))
monthly_spend <- financial %>%
filter(!category %in% c('Wage', 'RentAdjustment')) %>%
group_by(month) %>%
summarise(spend = sum(abs(amount)))
monthly_finance_status <- merge(monthly_income,monthly_spend,by=c("month"))
monthly_finance_status$spendRatio <- monthly_finance_status$spend / monthly_finance_status$income
monthly_finance_status$remain <- (monthly_finance_status$income - monthly_finance_status$spend)
monthly_finance_status$remain <- round(monthly_finance_status$remain, 1)
monthly_finance_status <- monthly_finance_status[order(as.Date(monthly_finance_status$month, format="%m")),]
paged_table(monthly_finance_status, options = list(rows.print = 15, cols.print = 5))
spending_monthly_cat <- financial %>%
filter(!category %in% c('Wage', 'RentAdjustment')) %>%
group_by(month, category) %>%
summarise(spend_cat = - sum(amount))
cost_ratio <- merge(spending_monthly_cat, monthly_finance_status, by=c("month"))
cost_ratio$spend_cat_ratio = cost_ratio$spend_cat / cost_ratio$income
paged_table(cost_ratio, options = list(rows.print = 15, cols.print = 5))
income_monthly_cat <- financial %>%
filter(category %in% c('Wage', 'RentAdjustment')) %>%
group_by(month, category) %>%
summarise(income_cat = round(sum(amount),1))
paged_table(income_monthly_cat, options = list(rows.print = 15, cols.print = 5))
To compare wage with the overall costs. We computed the fraction of spending versus wage. As we can see, most people spend 30%-36% of their income per month, and save the 70%. And people spend more and earn more money in Mar-May. It seems like the overall costs vesus wages is highest in April at 36%.
p1 <- ggplot(data=monthly_finance_status, aes(x=month,
y= spendRatio,
group=1,
text = paste('</br>Month: ', month,
'</br>Spend/Income: ', 100*round(spendRatio,4),"%"))) +
geom_point()+
geom_line(stat = "identity")+
geom_hline(yintercept=0.36, alpha = 0.3, linetype = 2) +
geom_hline(yintercept=0.316, alpha = 0.3, linetype = 2) +
labs(y= 'Spend/income ratio', x= 'Month',
title = "Fig 1.1 Spend/income (monthly)",
subtitle = "People spend 30%-35% of their income")
#geom_bar(position="dodge2", stat = "identity")
p2 <- ggplot(data=monthly_finance_status, aes(x=month,
y= spend,
group=1,
text = paste('</br>Month: ', month,
'</br>Spend/Income: ', 100*round(spendRatio,4),"%"))) +
geom_point()+
geom_line(stat = "identity")+
scale_y_continuous(labels = dollar)+
labs(y= 'Spend/income ratio', x= 'Month',
title = "Fig 1.2 Spend (monthly)",
subtitle = "Most people spending surges in Mar-May")
#ggplotly(p4,tooltip = "text")
p3 <- ggplot(data=monthly_finance_status, aes(x=month,
y= income,
group=1,
text = paste('</br>Month: ', month,
'</br>Spend/Income: ', 100*round(spendRatio,4),"%"))) +
geom_point()+
geom_line(stat = "identity")+
scale_y_continuous(labels = dollar)+
labs(y= 'Spend/income ratio', x= 'Month',
title = "Fig 1.3 Income (monthly)",
subtitle = "People's income surges in Mar-May")
p1|(p2/p3)
From the bar plot of total spending by category per month below, we can infer that the significant rise in spending in March-May may result from the increase of Shelter. Except March-May, the spending on each category is stable through time.
p2 <- ggplot(data=spending_monthly_cat, aes(x=month,
y= spend_cat,
color=category,
fill = category,
group=category,
text = paste('</br>Month: ', month,
'</br>Spending: ', round(spend_cat,0),
'</br>Category: ', category))) +
guides(fill = "none") +
geom_bar(stat='identity')+
facet_wrap(~ category) +
scale_y_continuous(labels = dollar)+
labs(y= 'Sum of spending', x= 'Month',
title = "Fig 2. Sum of spending by category (per month)",
subtitle = "Spending on shelter increase sharply in March-May")
#geom_bar(position="dodge2", stat = "identity") +
# facet_grid(category~. )
ggplotly(p2,tooltip="text")
And we can see that residents in Ohio spend nearly half of their spending on Shelter.
pie_data <- financial %>%
filter(!category %in% c('Wage', 'RentAdjustment')) %>%
group_by(category) %>%
summarise(spend = sum(-amount))
pie_data$fraction <- pie_data$spend / sum(pie_data$spend)
pie_data <- pie_data %>%
arrange(fraction) %>%
mutate(labels = paste0(category,'\n',scales::percent(fraction)))
paged_table(pie_data, options = list(rows.print = 15, cols.print = 5))
p3 <- ggplot(data=pie_data, aes(x="", y=fraction, fill=category)) +
geom_bar(width=1, stat="identity") +
geom_text(aes(label = labels),
position = position_stack(vjust = 0.5)) +
labs(y= 'Fraction of spending', x= 'Month',
title = "Fig 3. Fraction of spending by category(total)",
subtitle = "The largest part of spending is for 'Shelter'")+
coord_polar("y", start=0) +
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
axis.title.y=element_blank())
p3
Financial health consists of two parts, namely income and spending. To measure it, we firstly computed the fraction of spending on each category versus wage per month and trying to find the patterns in people’s spending. Then, we draw a graph of income by category per month and potential factors that may affect wages such as age and education level.
Here is the line plot for the relative cost per category comparing to the participants’ income. As we can see, people spend a relatively low portion of their income in education, while spend large portions on shelter, food and recreation .
And we can see a trend that people are spending more on food, less on other categories.
p4<-ggplot(data=cost_ratio, aes(x=month,
y= spend_cat_ratio,
group=category,
color=category,
text = paste(
'</br>Category: ', category,
'</br>Month: ', month,
'</br>Ratio in Income: ', 100*round(spend_cat_ratio,4),"%"))) +
geom_point()+
geom_line(stat = "identity")+
geom_smooth(method = "loess", se = FALSE) +
facet_wrap(~ category, scales = "free_y") +
labs(y= 'Spend/income ratio', x= 'Month',
title = "Fig 4. Spend/income trend by category (monthly)",
subtitle = "People are spending more on food, less on other categories")
#geom_bar(position="dodge2", stat = "identity")
ggplotly(p4,tooltip="text")
The income of participants consists of two parts, namely Wage and RentAdjustment. We can see that most of their income is wage. Only a small portion is from RentAdjustment in March.
p5 <- ggplot(data=income_monthly_cat, aes(x=month,
y= income_cat,
group=category,
fill = category,
text = paste('</br>Category: ', category,
'</br>Month: ', month,
'</br>Income: ', income_cat
)))+
guides(fill = "none") +
geom_bar(stat='identity')+
scale_y_continuous(labels = dollar)+
labs(y= 'Sum of income', x= 'Month',
title = "Fig 5. Sum of income by category (per month)",
subtitle = "Most of participants's income comes from wages.\n And it stays stable excepet a surge in March-May")
#geom_bar(position="dodge2", stat = "identity") +
# facet_grid(category~. )
ggplotly(p5,tooltip="text")
wage <- financial %>%
filter(category == "Wage") %>%
group_by(participantId) %>%
summarise(wage = mean(amount))
brks <- c(0, 100, 200, 300, 400, Inf)
grps <- c('<=100', '101-200', '201-300', '301-400', '>400')
wage$Wage_Group <- cut(wage$wage, breaks=brks, labels = grps, right = FALSE)
wage <- financial %>%
filter(category == 'Wage') %>%
group_by(participantId) %>%
summarise(wage = round(mean(amount),0))
brks <- c(0, 100, 200, 300, 400, Inf)
grps <- c('<=100', '101-200', '201-300', '301-400', '>400')
wage$Wage_Group <- cut(wage$wage, breaks=brks, labels = grps, right = FALSE)
wage_info <- merge(wage, participant_data, by=c("participantId"))
paged_table(wage_info, options = list(rows.print = 15, cols.print = 5))
Most people People with higher education have a higher mean salary.
p6 <- ggplot(data=wage_info, aes(x = educationLevel,
y = wage))+
geom_boxplot(position="dodge",aes(x = educationLevel, y = wage)) +
stat_summary(geom = "point",
fun="mean",
colour ="red",
size=2) +
geom_hline(yintercept=173.38, alpha = 0.3, linetype = 2) +
# stat_summary(aes(label = round(..y.., 0)), fun=mean, geom = "label_repel", size=3, angle=150) +
labs(y= 'Wage', x= 'educationLevel',
title = "Fig6: Wage Distribution by Education Level",
subtitle = "Most people get 50 per month")
ggplotly(p6)
But age doesn’t influence wage much
p7 <- ggplot(data=wage_info, aes(x = wage, y = Age_Group, fill = 0.5 - abs(0.5 - stat(ecdf)))) +
stat_density_ridges(
geom = "density_ridges_gradient",
calc_ecdf = TRUE,
jittered_points = TRUE, quantile_lines = TRUE, scale = 0.9, alpha = 0.7,
vline_size = 1, vline_color = "red",
point_size = 0.4, point_alpha = 1,
position = position_raincloud(adjust_vlines = TRUE)
) +
scale_x_continuous(expand = c(0, 0)) +
scale_y_discrete(expand = expand_scale(mult = c(0, 0))) +
coord_cartesian(clip = "off") +
theme_ridges(grid = FALSE, center_axis_labels = TRUE)+
scale_fill_viridis_c(name = "Tail probability", direction = -1)+
ggtitle("Fig7. Wage distribution of in different Age Group")+
theme(plot.title = element_text(size = 12))
p7
We find similar spending habits among people with similar wage levels and education level.
personal_spending_cat <- financial %>%
filter(!category %in% c('Wage', 'RentAdjustment')) %>%
group_by(category, participantId) %>%
summarise(spend_cat = - sum(amount))
personal_spending_cat <- merge(personal_spending_cat, wage_info, by=c("participantId"))
personal_spending_cat$spend_ratio <- personal_spending_cat$spend_cat / personal_spending_cat$wage
paged_table(personal_spending_cat, options = list(rows.print = 15, cols.print = 5))
People who has high wage tend to spend less money on shelter and food.
p8 <- ggplot(personal_spending_cat, aes(x=wage, y=spend_cat, fill=Wage_Group)) +
facet_grid(category~. , scale = 'free_y') +
geom_point(size=2, shape=23) +
scale_y_continuous(labels = dollar) +
stat_ellipse() +
labs(y= 'Spend', x= 'Wage',
title = "Fig8: Relatiobship between wage and spending for all the participants",
subtitle = "People who has high wage tend to spend less money")
spending_cat_each_wage_group <- personal_spending_cat %>%
group_by(category,Wage_Group) %>%
summarise(spend_cat_mean = mean(spend_cat))
p8
People with high education tend to spend more money on education.
p9 <- ggplot(personal_spending_cat, aes(x=educationLevel, y=spend_cat)) +
facet_grid(category~. , scale = 'free_y') +
geom_violin() +
stat_summary(geom = "point",
fun="mean",
colour ="red",
size=2) +
stat_summary(aes(label = round(..y.., 0)), fun=mean, geom = "label_repel", size=3, angle=150) +
labs(y= 'Wage', x= 'educationLevel',
title = "Fig9: Spending habit within each education level",
subtitle = "People with high education tend to spend more money on education and shelter")
p9
Here are all the graphs for challenge3.2